import scrapy
import time
import json
from bsws_spider.items import BlogItem
class CsdnSpider(scrapy.Spider):
    """Spider for the CSDN blog home feed.

    Pages through the JSON feed API (``/api/articles``), yields one request
    per article, and scrapes each article page for publish date, read count
    and body text, filling a :class:`BlogItem` along the way.
    """

    name = 'csdn'
    allowed_domains = ['csdn.net']  # confine the crawl to csdn.net

    # The feed API requires a session cookie and a referer; anonymous
    # requests are rejected. NOTE(review): this hard-coded cookie will
    # expire — consider loading it from settings.
    headers = {
        'cookie': 'uuid_tt_dd=10_35506074690-1543408047225-128682; _ga=GA1.2.1747139185.1546238144; __yadk_uid=YTIMDOf9Dj1604cMXk1BIQRtSaEtuoC7; bdshare_firstime=1552953352430; UN=fanfzj; BT=1553358121270; dc_session_id=10_1553869355729.263729; smidV2=201812021611467b57a701e3d1ab90103734fe8cb98c5300121369440d44160; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1554650343,1554650343,1554650343,1557066477; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_35506074690-1543408047225-128682!5744*1*fanfzj; c-login-auto=15; TY_SESSION_ID=56cbc0bc-633b-444f-90ef-36d55e0e4c42; SESSION=97fc1b4d-5a37-4230-a72e-2b5bccbcf684; dc_tos=preblk; ADHOC_MEMBERSHIP_CLIENT_ID1.0=80c382a2-1139-938a-2cb5-1224f4edeaea',
        'referer': 'https://blog.csdn.net/'
    }

    def start_requests(self):
        """Seed the crawl with the first feed page.

        The API paginates by ``shown_offset``, a microsecond timestamp;
        "now" yields the newest articles.
        """
        url = 'https://blog.csdn.net/api/articles?type=more&category=home&shown_offset={}'.format(
            int(time.time() * 1000000))
        yield scrapy.Request(url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Parse one feed-API page.

        Yields a detail-page request per article (carrying a partially
        filled ``BlogItem`` in ``meta``) and, while the API reports more
        data, a request for the next feed page.
        """
        json_dict = json.loads(response.text)
        for a_item in json_dict.get('articles', []):
            item = BlogItem()
            item['title'] = a_item['title']
            item['labels'] = a_item.get('tag', '')
            item['author'] = a_item['nickname']
            item['category'] = a_item['category']
            # Visit the article page itself for date/read-count/content.
            yield scrapy.Request(a_item['url'], headers=self.headers,
                                 callback=self.parse_info, meta={'item': item})
        # Pagination status check. json.loads decodes a JSON boolean true
        # as Python True, while some responses carry the string 'true';
        # the original only accepted the string and silently stopped
        # paginating on a boolean, so accept both here.
        status = json_dict.get('status', False)
        if status is True or status == 'true':
            shown_offset = int(json_dict.get('shown_offset', int(time.time() * 1000000)))
            # Never page with a stale offset: bump it past "now" so the
            # API keeps returning fresh articles.
            if shown_offset <= int(time.time() * 1000000):
                shown_offset = int(time.time() * 1000000) + 1
            url = 'https://blog.csdn.net/api/articles?type=more&category=home&shown_offset={0}'.format(shown_offset)
            print(url)
            yield scrapy.Request(url, callback=self.parse, headers=self.headers)

    def parse_info(self, response):
        """Scrape an article detail page and complete the BlogItem.

        Missing page elements (extract_first() returning None) no longer
        crash the spider: the corresponding field is left empty instead.
        """
        item = response.meta.get('item')
        # Publish date ("fbrq"): normalise 'YYYY年MM月DD日 HH:MM:SS' to ISO date.
        date = response.xpath('//span[@class="time"]/text()').extract_first()
        if date:
            item['fbrq'] = time.strftime('%Y-%m-%d', time.strptime(date, '%Y年%m月%d日 %H:%M:%S'))
        else:
            item['fbrq'] = ''  # layout changed or element absent — don't crash
        # Read count. str.strip with a multi-char argument removes those
        # CHARACTERS from both ends (not a prefix), which here peels off
        # the '阅读数： ' label and leaves the digits.
        read_num = response.xpath('//span[@class="read-count"]/text()').extract_first()
        item['read_num'] = (read_num or '').strip('阅读数： ')
        # Full article text via XPath string() on the content container.
        contents = response.xpath('string(//*[@id="content_views"])').extract_first()
        item['contents'] = (contents or '').strip()
        item['href'] = response.url
        return item